Imports

In [1]:
!pip install sklearn-som
!pip install minisom
Collecting sklearn-som
  Downloading sklearn_som-1.1.0-py3-none-any.whl (6.7 kB)
Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from sklearn-som) (1.21.6)
Installing collected packages: sklearn-som
Successfully installed sklearn-som-1.1.0
Collecting minisom
  Downloading MiniSom-2.3.0.tar.gz (8.8 kB)
Building wheels for collected packages: minisom
  Building wheel for minisom (setup.py) ... done
  Created wheel for minisom: filename=MiniSom-2.3.0-py3-none-any.whl size=9018 sha256=4ebf8811dca112bd4743d001419f2b5ef9539c6ea9e85eae071f61210402cb1b
  Stored in directory: /root/.cache/pip/wheels/d4/ca/4a/488772b0399fec45ff53132ed14c948dec4b30deee3a532f80
Successfully built minisom
Installing collected packages: minisom
Successfully installed minisom-2.3.0
In [68]:
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from minisom import MiniSom
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import adjusted_rand_score, silhouette_samples, silhouette_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics.cluster import pair_confusion_matrix
from sklearn_som.som import SOM
from sklearn import preprocessing
from scipy.cluster.hierarchy import dendrogram, linkage

Kernelized Kmeans

In [3]:
import numpy as np

def rbf(x1, x2, gamma = None):
        '''
        RBF: Radial basis function or guassian kernel
        ----------------------------------------------
        :param: x1: NxD transposed feature space
        :param: x2: NxD feature space
        :param: gamma: 1/2(sigma-square)
        :return type: kernel(Gram) matrix
        '''
        if not gamma:
            gamma = 1
        else:
            gamma = gamma
        if x1.ndim == 1 and x2.ndim == 1:
            return np.exp(-gamma * np.linalg.norm(x1 - x2))
        elif (x1.ndim > 1 and x2.ndim == 1) or (x1.ndim == 1 and x2.ndim > 1):
            return np.exp(-gamma * np.linalg.norm(x1 - x2, axis = 1))
        elif x1.ndim > 1 and x2.ndim > 1:
            return np.exp(-gamma * np.linalg.norm(x1[:, np.newaxis] - x2[np.newaxis, :], axis = 2))

class kkMeans:
    def __init__(self, k = None, gamma = None, d = None):
        '''
        :param: k: number of clusters
        :param: kernel
        
        '''
        super().__init__()
        if not gamma:
            gamma = 5
            self.gamma = gamma
        else:
            self.gamma = gamma
        if not d:
            d = 3
            self.d = d
        else:
            self.d = d
        if not k:
            k = 2
            self.k = k
        else:
            self.k = k
        return
    
    def kernelize(self, x1, x2):
        return rbf(x1, x2, gamma = self.gamma)
    
    #compute suqared kernel distance
    def distance(self, kappa):
        self.dist_w = np.zeros(self.k)
        c_k_init = np.ones(kappa.shape[0])
        for ii in range(self.k):
            self.c_k_indices = self.clusters == ii
            self.c_k = np.sum(c_k_init[self.c_k_indices])
            self.c_k_squared = np.square(self.c_k)
            self.kappa_ii = kappa[self.c_k_indices][:, self.c_k_indices]
            self.distance_ii[:, ii] += np.sum(self.kappa_ii)/self.c_k_squared - 2*\
                                    np.sum(kappa[:, self.c_k_indices], axis = 1)/self.c_k
        return self.distance_ii
           
    #fit and return cluster labels
    def fit_predict(self, X, iteration = None, halt = None):
        '''
        :param: X: NxD Feature
        :param: iteration: 100
        :param: tolerance: 1e-3 default
        
        '''
        if not halt:
            halt = 1e-3
            self.halt = halt
        else:
            self.halt = halt
        if not iteration:
            iteration = 100
            self.iteration = iteration
        else:
            self.iteration = iteration
        self.X = X
        N, D = X.shape
        self.distance_ii = np.zeros((N, self.k))
        self.kx = self.kernelize(self.X, self.X)
        self.clusters = np.random.randint(self.k, size = N)
        '''iterate by checking to see if new and previous cluster
        labels are similar. If they are, algorithm converges and halts..
        '''
        for ii in range(self.iteration):
            #compute distance
            self.distance_k = self.distance(self.kx)
            self.prev_clusters = self.clusters
            self.clusters = self.distance_k.argmin(axis=1)
            if 1 - float(np.sum((self.clusters - self.prev_clusters) == 0)) / N < self.halt:
                break
        return self

    def rand_index_score(self, clusters, classes):
        '''Compute the RandIndex
        :param: Clusters: Cluster labels
        :param: classses: Actual class
        :returntype: float
        '''
        from scipy.special import comb
        tp_fp = comb(np.bincount(clusters), 2).sum()
        tp_fn = comb(np.bincount(classes), 2).sum()
        A = np.c_[(clusters, classes)]
        tp = sum(comb(np.bincount(A[A[:, 0] == i, 1]), 2).sum()
                 for i in set(clusters))
        fp = tp_fp - tp
        fn = tp_fn - tp
        tn = comb(len(A), 2) - tp - fp - fn
        self.randindex = (tp + tn) / (tp + fp + fn + tn)
        return self.randindex
In [4]:
data_2d_10c = pd.read_csv("./2d-10c.dat", header=None, sep=' ')[3:]
data_iris = pd.read_csv("./iris.csv", header=None)
data_long = pd.read_csv("./long.data", header=None, sep=' ')
data_order2_3clust = pd.read_csv("./order2-3clust.csv", header=None)
data_smile = pd.read_csv("./smile.csv", header=None)
data_square = pd.read_csv("./square.data", header=None, sep=' ')
data_heart = pd.read_csv("./heart.csv")

scaler = preprocessing.StandardScaler()

data_2d_10c_X = data_2d_10c.to_numpy()[:, :-1]
data_2d_10c_y = data_2d_10c.to_numpy()[:, -1]
data_iris_X = data_iris.to_numpy()[:, :-1]
data_iris_y = data_iris.to_numpy()[:, -1]
data_long_X = data_long.to_numpy()[:, :-1]
data_long_y = data_long.to_numpy()[:, -1]
data_order2_3clust_X = data_order2_3clust.to_numpy()[:, :-1]
data_order2_3clust_y = data_order2_3clust.to_numpy()[:, -1]
data_smile_X = data_smile.to_numpy()[:, :-1]
data_smile_y = data_smile.to_numpy()[:, -1]
data_square_X = data_square.to_numpy()[:, :-1]
data_square_y = data_square.to_numpy()[:, -1]
data_heart_X = data_heart.to_numpy()[:, :-1]
data_heart_y = data_heart.to_numpy()[:, -1]

data_2d_10c_X_normalized = scaler.fit_transform(data_2d_10c_X)
data_iris_X_normalized = scaler.fit_transform(data_iris_X)
data_long_X_normalized = scaler.fit_transform(data_long_X)
data_order2_3clust_X_normalized = scaler.fit_transform(data_order2_3clust_X)
data_smile_X_normalized = scaler.fit_transform(data_smile_X)
data_square_X_normalized = scaler.fit_transform(data_square_X)
data_heart_X_normalized = scaler.fit_transform(data_heart_X)

Dataset 2d-10c

In [ ]:
data_X = data_2d_10c_X
data_X_normalized = data_2d_10c_X_normalized
data_y = data_2d_10c_y
n_components = 10
n_dimensions = 2
In [ ]:
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=data_y)
plt.show()

Single linkage

In [ ]:
clustering = AgglomerativeClustering(linkage='single', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.36306139186145114

Complete linkage

In [ ]:
clustering = AgglomerativeClustering(linkage='complete', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.5788494654191659

Average linkage

In [ ]:
clustering = AgglomerativeClustering(linkage='average', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.7282505385885099

Ward variance

In [ ]:
clustering = AgglomerativeClustering(linkage='ward', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.8170271521894713

DBSCAN

In [ ]:
clustering = DBSCAN(eps=0.62).fit(data_X)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.9295401997083553

KMeans

In [ ]:
kmeans = KMeans(n_clusters=n_components, random_state=0).fit(data_X_normalized)
predictions = kmeans.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.7337304054637764

Kernelized KMeans

In [ ]:
kkmeans = kkMeans(k = n_components, gamma=0.95).fit_predict(data_X_normalized)
predictions = kkmeans.clusters

plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.7396969699703344

Expectation-Maximization (EM) clustering

In [ ]:
gm = GaussianMixture(n_components=n_components, random_state=0).fit(data_X)
predictions = gm.predict(data_X)

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.9982089084808446

Self - organizing maps (SOM)

In [ ]:
som = SOM(m=n_components, n=1, dim=n_dimensions)
som.fit(data_X)
predictions = som.predict(data_X)

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.6853519594673922

For this dataset EM produces the best result with a 99% score for the adjusted rand index. The data contains groups that aren't interwined where EM can easily fit gaussian mixture models for each group, unlike KMeans which splits the data in Voronoi-cells.

Dataset iris

In [7]:
data_X = data_iris_X
data_X_normalized = data_iris_X_normalized
data_y = data_iris_y
n_components = 3
n_dimensions = 4
In [ ]:
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=data_y)
plt.show()

Single linkage

In [ ]:
clustering = AgglomerativeClustering(linkage='single', n_clusters=n_components).fit(data_X)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.5637510205230709

Complete linkage

In [ ]:
clustering = AgglomerativeClustering(linkage='complete', n_clusters=n_components).fit(data_X)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.6422512518362898

Average linkage

In [ ]:
clustering = AgglomerativeClustering(linkage='average', n_clusters=n_components).fit(data_X)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.7591987071071522

Ward variance

In [ ]:
clustering = AgglomerativeClustering(linkage='ward', n_clusters=n_components).fit(data_X)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.7311985567707746

DBSCAN

In [ ]:
clustering = DBSCAN(eps=1).fit(data_X)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.5681159420289855

KMeans

In [ ]:
kmeans = KMeans(n_clusters=n_components, random_state=0).fit(data_X)
predictions = kmeans.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.7302382722834697

Kernelized KMeans

In [ ]:
kkmeans = kkMeans(k = n_components, gamma=0.21).fit_predict(data_X)
predictions = kkmeans.clusters

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.7726314170414115

Expectation-Maximization (EM) clustering

In [ ]:
gm = GaussianMixture(n_components=n_components, random_state=0).fit(data_X)
predictions = gm.predict(data_X)

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.9038742317748124

Self - organizing maps (SOM)

In [9]:
target = data_iris.iloc[: , -1].values
label_names = {0:'Iris Setosa', 1:'Iris Versicolour', 2:'Iris Virginica'}
data_y = data_iris[data_iris.columns[-1]]
data = data_iris[data_iris.columns[:-1]]

data = (data - np.mean(data, axis=0)) / np.std(data, axis=0)
data = data.values
In [18]:
def plot_som_density(som, data):
  colors = ['C0', 'C1', 'C2']
  w_x, w_y = zip(*[som.winner(d) for d in data])
  w_x = np.array(w_x)
  w_y = np.array(w_y)

  plt.figure(figsize=(10, 9))
  plt.pcolor(som.distance_map().T, cmap='bone_r', alpha=.2)
  plt.colorbar()

  for c in np.unique(target):
      idx_target = target==c
      plt.scatter(w_x[idx_target]+.5+(np.random.rand(np.sum(idx_target))-.5)*.8,
                  w_y[idx_target]+.5+(np.random.rand(np.sum(idx_target))-.5)*.8, 
                  s=50, c=colors[c-1], label=label_names[c])
  plt.legend(loc='center right')
  plt.grid()
  plt.show()

def plot_som_clusters(som, som_shape, data):
  winner_coordinates = np.array([som.winner(x) for x in data]).T
  cluster_index = np.ravel_multi_index(winner_coordinates, som_shape)

  for c in np.unique(cluster_index):
      plt.scatter(data[cluster_index == c, 0],
                  data[cluster_index == c, 1], alpha=.7)

  for centroid in som.get_weights():
      plt.scatter(centroid[:, 0], centroid[:, 1], marker='x', 
                  s=80, linewidths=35, color='k')
      
def classify(som, data_X, data_y):
    winmap = som.labels_map(data_X, data_y)
    default_class = np.sum(list(winmap.values())).most_common()[0][0]
    result = []
    for d in data:
        win_position = som.winner(d)
        if win_position in winmap:
            result.append(winmap[win_position].most_common()[0][0])
        else:
            result.append(default_class)
    return result

def classifyAfterClustering(som, data_X, data_y, neuron_clusters):
  winmap = som.labels_map(data_X, data_y)
  result = []

  for d in data:
      win_position = som.winner(d)
      result.append(neuron_clusters[win_position[0]])

  return result
In [19]:
n_neurons = 3
m_neurons = 3
som = MiniSom(n_neurons, m_neurons, data.shape[1], sigma=0.1, learning_rate=.1, 
              neighborhood_function='gaussian', random_seed=0)

som.pca_weights_init(data)
som.train(data, 1000, verbose=True)

plot_som_density(som, data)
 [ 1000 / 1000 ] 100% - 0:00:00 left 
 quantization error: 0.6320520283464257
In [20]:
plot_som_clusters(som, (n_neurons, m_neurons), data)
In [22]:
predictions = classify(som, data, data_y)

plt.figure(figsize=(8, 8))
plt.scatter(data[:, 0], data[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.7076975836448793
In [25]:
distance_map = som.distance_map().T

kmeans = KMeans(n_clusters=3, random_state=0).fit(distance_map)
neuron_clusters = kmeans.labels_

predictions = classifyAfterClustering(som, data, data_y, neuron_clusters)
plt.figure(figsize=(8, 8))
plt.scatter(data[:, 0], data[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.42574951577062775
In [26]:
n_neurons = 6
m_neurons = 6
som = MiniSom(n_neurons, m_neurons, data.shape[1], sigma=0.1, learning_rate=.1, 
              neighborhood_function='gaussian', random_seed=0)

som.pca_weights_init(data)
som.train(data, 1000, verbose=True)

plot_som_density(som, data)
 [ 1000 / 1000 ] 100% - 0:00:00 left 
 quantization error: 0.5877990167743453
In [27]:
plot_som_clusters(som, (n_neurons, m_neurons), data)
In [28]:
kmeans = KMeans(n_clusters=3, random_state=0).fit(som.distance_map().T)
neuron_clusters = kmeans.labels_

predictions = classifyAfterClustering(som, data, data_y, neuron_clusters)
plt.figure(figsize=(8, 8))
plt.scatter(data[:, 0], data[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.24889189176238327
In [29]:
predictions = classify(som, data, data_y)

plt.figure(figsize=(8, 8))
plt.scatter(data[:, 0], data[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.8681109348039796

For this dataset EM produces the best result with a 90% score for the adjusted rand index.

Dataset long

In [ ]:
data_X = data_long_X
data_X_normalized = data_long_X_normalized
data_y = data_long_y
n_components = 2
n_dimensions = 2
In [ ]:
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=data_y)
plt.show()

Single linkage

In [ ]:
clustering = AgglomerativeClustering(linkage='single', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  1.0

Complete linkage

In [ ]:
clustering = AgglomerativeClustering(linkage='complete', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.009450908594953032

Average linkage

In [ ]:
clustering = AgglomerativeClustering(linkage='average', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.000200739823873989

Ward variance

In [ ]:
clustering = AgglomerativeClustering(linkage='ward', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  1.0

DBSCAN

In [ ]:
clustering = DBSCAN(eps=0.8, min_samples=1).fit(data_X_normalized)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  1.0

KMeans

In [ ]:
kmeans = KMeans(n_clusters=n_components, random_state=0).fit(data_X_normalized)
predictions = kmeans.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  1.0

Kernelized KMeans

In [ ]:
kkmeans = kkMeans(k = n_components, gamma=0.21).fit_predict(data_X_normalized)
predictions = kkmeans.clusters

plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  1.0

Expectation-Maximization (EM) clustering

In [ ]:
gm = GaussianMixture(n_components=n_components, random_state=0).fit(data_X_normalized)
predictions = gm.predict(data_X_normalized)

plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  1.0

Self - organizing maps (SOM)

In [ ]:
som = SOM(m=n_components, n=1, dim=n_dimensions)
som.fit(data_X_normalized)
predictions = som.predict(data_X_normalized)

plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  1.0

All the algorithms except Average and Complete linkage managed to obtain a 100% adjusted rand index, correctly identifying the clusters.

The dataset doesn't contain much noise or outliers. Single linkage performs well since it tends to form chains like the two clusters in our dataset. However, Complete and Average linkage fail since theyre more sensitive to outliers and tend to break up large clusters. KMeans can also separate them into Voronoi cells if the data is normalized, since the axis of the decision line is reversed. The clusters can be also fit by the EM Gaussian mixture models.

Dataset order2-3clust

In [ ]:
data_X = data_order2_3clust_X
data_X_normalized = data_order2_3clust_X_normalized
data_y = data_order2_3clust_y
n_components = 3
n_dimensions = 2
In [ ]:
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=data_y)
plt.show()

Single linkage

In [ ]:
clustering = AgglomerativeClustering(linkage='single', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.5664477456671392

Complete linkage

In [ ]:
clustering = AgglomerativeClustering(linkage='complete', n_clusters=n_components).fit(data_X)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.2924572720298255

Average linkage

In [ ]:
clustering = AgglomerativeClustering(linkage='average', n_clusters=n_components).fit(data_X)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.179046159897769

Ward variance

In [ ]:
clustering = AgglomerativeClustering(linkage='ward', n_clusters=n_components).fit(data_X)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.31276042541209537

DBSCAN

In [ ]:
clustering = DBSCAN(eps=0.35, min_samples=2).fit(data_X_normalized)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.7542894292800925

KMeans

In [ ]:
kmeans = KMeans(n_clusters=n_components, random_state=0).fit(data_X_normalized)
predictions = kmeans.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.26661149956331426

Kernelized KMeans

In [ ]:
kkmeans = kkMeans(k = n_components, gamma=1).fit_predict(data_X_normalized)
predictions = kkmeans.clusters

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.5035648207395133

Expectation-Maximization (EM) clustering

In [ ]:
gm = GaussianMixture(n_components=n_components, random_state=0).fit(data_X)
predictions = gm.predict(data_X)

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.3808820562888344

Self - organizing maps (SOM)

In [ ]:
som = SOM(m=n_components, n=1, dim=n_dimensions)
som.fit(data_X)
predictions = som.predict(data_X)

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.23130272100189786

For this dataset DBSCAN produces the best result with a 75% score for the adjusted rand index.

DBSCAN has the advantage of finding arbitrarily-shaped clusters, the only condition is that the clusters shouldn't be connected. However, the points on the cluster borders might swap cluster membership, just like in our result.

Dataset smile

In [ ]:
data_X = data_smile_X
data_X_normalized = data_smile_X_normalized
data_y = data_smile_y
n_components = 4
n_dimensions = 2
In [ ]:
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=data_y)
plt.show()

Single linkage

In [ ]:
clustering = AgglomerativeClustering(linkage='single', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  1.0

Complete linkage

In [ ]:
clustering = AgglomerativeClustering(linkage='complete', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.2038622216846395

Average linkage

In [ ]:
clustering = AgglomerativeClustering(linkage='average', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.2458668674230882

Ward variance

In [ ]:
clustering = AgglomerativeClustering(linkage='ward', n_clusters=n_components).fit(data_X)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.20414654709894994

DBSCAN

In [ ]:
clustering = DBSCAN().fit(data_X_normalized)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  1.0

KMeans

In [ ]:
kmeans = KMeans(n_clusters=n_components, random_state=0).fit(data_X)
predictions = kmeans.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.1851848367390459

Kernelized KMeans

In [ ]:
kkmeans = kkMeans(k = n_components, gamma=3).fit_predict(data_X)
predictions = kkmeans.clusters

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.4165764595511294

Expectation-Maximization (EM) clustering

In [ ]:
gm = GaussianMixture(n_components=n_components, random_state=0).fit(data_X)
predictions = gm.predict(data_X)

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.19229022827641973

Self - organizing maps (SOM)

In [ ]:
som = SOM(m=n_components, n=1, dim=n_dimensions)
som.fit(data_X)
predictions = som.predict(data_X)

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.19568902092626914

For this dataset Single Linkage and DBSCAN produce the best results with a 100% score for the adjusted rand index.

Single Linkages manages to correctly assign the data to clusters, since it is able to identify the countour of the face and the mouth as they are easily chained together. The same applies to DBSCAN, which can find arbitrarily-shaped clusters even if they are completely surrounded by another cluster, as long as they are not connected.

The other algorithms like EM performed poorly, since its hard for a Gaussian mixture model to correctly fit to the countour of the face, as well as fitting the inner features. The same reasoning applies to KMeans, which wont be able to find clusters that contain another.

Dataset square

In [ ]:
data_X = data_square_X
data_X_normalized = data_square_X_normalized
data_y = data_square_y
n_components = 4
n_dimensions = 2
In [ ]:
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=data_y)
plt.show()

Single linkage

In [ ]:
clustering = AgglomerativeClustering(linkage='single', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  1.3507537471235578e-06

Complete linkage

In [ ]:
clustering = AgglomerativeClustering(linkage='complete', n_clusters=n_components).fit(data_X)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.5504838884994412

Average linkage

In [ ]:
clustering = AgglomerativeClustering(linkage='average', n_clusters=n_components).fit(data_X)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.6037435609973467

Ward variance

In [ ]:
clustering = AgglomerativeClustering(linkage='ward', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.5946287023125574

DBSCAN

In [ ]:
clustering = DBSCAN().fit(data_X)
predictions = clustering.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.2042399389303871

KMeans

In [ ]:
kmeans = KMeans(n_clusters=n_components, random_state=0).fit(data_X)
predictions = kmeans.labels_

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.6931800370598739

Kernelized KMeans

In [ ]:
kkmeans = kkMeans(k = n_components, gamma=0.1).fit_predict(data_X)
predictions = kkmeans.clusters

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.652793478052305

Expectation-Maximization (EM) clustering

In [ ]:
gm = GaussianMixture(n_components=n_components, random_state=0).fit(data_X)
predictions = gm.predict(data_X)

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.6843772646953642

Self - organizing maps (SOM)

In [ ]:
som = SOM(m=n_components, n=1, dim=n_dimensions)
som.fit(data_X)
predictions = som.predict(data_X)

plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()

print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
Adjusted rand index:  0.37546057562697427

For this dataset KMeans produces the best result with a 69% score for the adjusted rand index.

The Kernelized KMeans and EM algorithms also managed to get a 68% score for the adjusted rand index. The dataset contains a lot of noise which makes it hard for the algorithms to identify clusters, especially for those where we have to specify the number of clusters beforehand.

Determining the optimal number of clusters on iris

KMeans

In [81]:
def plot_silhouette_kmeans(data):
  for n_clusters in range(2, 11):
      fig, (ax1, ax2) = plt.subplots(1, 2)
      fig.set_size_inches(18, 7)

      ax1.set_xlim([-1, 1])
      ax1.set_ylim([0, len(data) + (n_clusters + 1) * 10])

      clusterer = KMeans(n_clusters=n_clusters, random_state=0)
      cluster_labels = clusterer.fit_predict(data)

      silhouette_avg = silhouette_score(data, cluster_labels)
      print(
          "For n_clusters =",
          n_clusters,
          "The average silhouette_score is :",
          silhouette_avg,
      )

      sample_silhouette_values = silhouette_samples(data, cluster_labels)

      y_lower = 10
      for i in range(n_clusters):
          ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]

          ith_cluster_silhouette_values.sort()

          size_cluster_i = ith_cluster_silhouette_values.shape[0]
          y_upper = y_lower + size_cluster_i

          color = cm.nipy_spectral(float(i) / n_clusters)
          ax1.fill_betweenx(
              np.arange(y_lower, y_upper),
              0,
              ith_cluster_silhouette_values,
              facecolor=color,
              edgecolor=color,
              alpha=0.7,
          )

          ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

          y_lower = y_upper + 10

      ax1.set_title("The silhouette plot for the various clusters.")
      ax1.set_xlabel("The silhouette coefficient values")
      ax1.set_ylabel("Cluster label")

      ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

      ax1.set_yticks([])
      ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

      colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
      ax2.scatter(
          data[:, 0],
          data[:, 1],
          marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
      )

      centers = clusterer.cluster_centers_
      ax2.scatter(
          centers[:, 0],
          centers[:, 1],
          marker="o",
          c="white",
          alpha=1,
          s=200,
          edgecolor="k",
      )

      for i, c in enumerate(centers):
          ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")

      ax2.set_title("The visualization of the clustered data.")
      ax2.set_xlabel("Feature space for the 1st feature")
      ax2.set_ylabel("Feature space for the 2nd feature")

      plt.suptitle(
          "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
          % n_clusters,
          fontsize=14,
          fontweight="bold",
      )

  plt.show()
In [82]:
plot_silhouette_kmeans(data_iris_X)
For n_clusters = 2 The average silhouette_score is : 0.6808136202936816
For n_clusters = 3 The average silhouette_score is : 0.5525919445499757
For n_clusters = 4 The average silhouette_score is : 0.4978256901095472
For n_clusters = 5 The average silhouette_score is : 0.49285559183161204
For n_clusters = 6 The average silhouette_score is : 0.36820569682713084
For n_clusters = 7 The average silhouette_score is : 0.3600597997328459
For n_clusters = 8 The average silhouette_score is : 0.3538815380353622
For n_clusters = 9 The average silhouette_score is : 0.3407785831583716
For n_clusters = 10 The average silhouette_score is : 0.33380799465469013

Intra-class variances

In [6]:
distorsions = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(data_iris_X)
    distorsions.append(kmeans.inertia_ / data_iris_X.shape[1])

fig = plt.figure(figsize=(15, 5))
plt.plot(range(2, 11), distorsions)
plt.xticks(range(2, 11))
plt.grid(True)
plt.title('Elbow curve')
plt.show()
In [ ]:
distorsions = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(data_iris_X)
    predictions = kmeans.labels_

    distorsions.append(adjusted_rand_score(data_iris_y, predictions))

fig = plt.figure(figsize=(15, 5))
plt.plot(range(2, 11), distorsions)
plt.xticks(range(2, 11))
plt.grid(True)
plt.title('Adjusted rand score')
plt.show()

The Silhouette Width score on KMeans shows that the best number of clusters should be 2. However, studying the silhouette plot, we can see that even for 3 clusters there aren't any points that were assigned to the wrong cluster. Moreover, the points aren't predominately grouped in the first cluster like in the first plot, instead being more evenly distributed among the 3 clusters. This intuition is confirmed by the intra-class variance and the adjusted rand score plots that show a degradation starting from 3 clusters instead of 2. However, the adjusted rand score contains the ground truth so it is normal that is suggest 3 clusters.

Hierarchical

Plotting functions

In [84]:
def plot_dendrogram(metric, data):
  linked = linkage(data, metric)

  labelList = list(range(len(data)))

  fig = plt.figure(figsize=(10, 7))
  fig.patch.set_facecolor('white')

  dendrogram(linked,
            orientation='top',
            labels=labelList,
            distance_sort='descending',
            show_leaf_counts=True)
  plt.show()

def plot_silhouette(metric, data):
    for n_clusters in range(2, 11):
      fig, (ax1, ax2) = plt.subplots(1, 2)
      fig.set_size_inches(18, 7)

      ax1.set_xlim([-1, 1])
      ax1.set_ylim([0, len(data) + (n_clusters + 1) * 10])

      clusterer = AgglomerativeClustering(linkage=metric, n_clusters=n_clusters).fit(data)
      cluster_labels = clusterer.labels_

      silhouette_avg = silhouette_score(data, cluster_labels)
      print(
          "For n_clusters =",
          n_clusters,
          "The average silhouette_score is :",
          silhouette_avg,
      )

      sample_silhouette_values = silhouette_samples(data, cluster_labels)

      y_lower = 10
      for i in range(n_clusters):
          ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]

          ith_cluster_silhouette_values.sort()

          size_cluster_i = ith_cluster_silhouette_values.shape[0]
          y_upper = y_lower + size_cluster_i

          color = cm.nipy_spectral(float(i) / n_clusters)
          ax1.fill_betweenx(
              np.arange(y_lower, y_upper),
              0,
              ith_cluster_silhouette_values,
              facecolor=color,
              edgecolor=color,
              alpha=0.7,
          )

          ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

          y_lower = y_upper + 10

      ax1.set_title("The silhouette plot for the various clusters.")
      ax1.set_xlabel("The silhouette coefficient values")
      ax1.set_ylabel("Cluster label")

      ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

      ax1.set_yticks([])
      ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

      colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
      ax2.scatter(
          data[:, 0],
          data[:, 1],
          marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
      )

      ax2.set_title("The visualization of the clustered data.")
      ax2.set_xlabel("Feature space for the 1st feature")
      ax2.set_ylabel("Feature space for the 2nd feature")

      plt.suptitle(
          f"Silhouette analysis for {metric} linkage on sample data with n_clusters = %d"
          % n_clusters,
          fontsize=14,
          fontweight="bold",
      )

    plt.show()

Single linkage

In [85]:
plot_silhouette('single', data_iris_X)
For n_clusters = 2 The average silhouette_score is : 0.6863930543445408
For n_clusters = 3 The average silhouette_score is : 0.5118387098922373
For n_clusters = 4 The average silhouette_score is : 0.28177774044073334
For n_clusters = 5 The average silhouette_score is : 0.2834836728123189
For n_clusters = 6 The average silhouette_score is : 0.2211144115880162
For n_clusters = 7 The average silhouette_score is : 0.13143138174943914
For n_clusters = 8 The average silhouette_score is : 0.05737112103104247
For n_clusters = 9 The average silhouette_score is : 0.02109854792601768
For n_clusters = 10 The average silhouette_score is : 0.02381607934946184
In [86]:
plot_dendrogram('single', data_iris_X)

Complete Linkage

In [87]:
plot_silhouette('complete', data_iris_X)
For n_clusters = 2 The average silhouette_score is : 0.5160600757054667
For n_clusters = 3 The average silhouette_score is : 0.5133502348448076
For n_clusters = 4 The average silhouette_score is : 0.4995741149430846
For n_clusters = 5 The average silhouette_score is : 0.34999771139358965
For n_clusters = 6 The average silhouette_score is : 0.34205174735616767
For n_clusters = 7 The average silhouette_score is : 0.33362901571416487
For n_clusters = 8 The average silhouette_score is : 0.32788913694554
For n_clusters = 9 The average silhouette_score is : 0.3230322341376915
For n_clusters = 10 The average silhouette_score is : 0.3067143646434455
In [88]:
plot_dendrogram('complete', data_iris_X)

Average linkage

In [89]:
plot_silhouette('average', data_iris_X)
For n_clusters = 2 The average silhouette_score is : 0.6863930543445408
For n_clusters = 3 The average silhouette_score is : 0.5539343898551252
For n_clusters = 4 The average silhouette_score is : 0.47166531675930684
For n_clusters = 5 The average silhouette_score is : 0.4303416822141368
For n_clusters = 6 The average silhouette_score is : 0.34065865237555976
For n_clusters = 7 The average silhouette_score is : 0.36941067750646717
For n_clusters = 8 The average silhouette_score is : 0.3645436084191249
For n_clusters = 9 The average silhouette_score is : 0.31677604290958056
For n_clusters = 10 The average silhouette_score is : 0.3083804867355036
In [90]:
plot_dendrogram('average', data_iris_X)

Ward Variance

In [91]:
plot_silhouette('ward', data_iris_X)
For n_clusters = 2 The average silhouette_score is : 0.6863930543445408
For n_clusters = 3 The average silhouette_score is : 0.5540972908150553
For n_clusters = 4 The average silhouette_score is : 0.4887398491004218
For n_clusters = 5 The average silhouette_score is : 0.4841553526240121
For n_clusters = 6 The average silhouette_score is : 0.3618845152256815
For n_clusters = 7 The average silhouette_score is : 0.34485408962742314
For n_clusters = 8 The average silhouette_score is : 0.3462405312208591
For n_clusters = 9 The average silhouette_score is : 0.33313850659637134
For n_clusters = 10 The average silhouette_score is : 0.32916031163928877
In [92]:
plot_dendrogram('ward', data_iris_X)

For hierarchical clustering with the silhouette score shows again that the number of clusters should be 2. Again, further inspecting the silhouette plot, we can see that for 3 clusters we have less points assigned to the wrong cluster for every method except Single Linkage, as well as them being more evenly distributed among clusters.

The dendograms show that theres a significant distance between the green and the red clusters. We could conclude that the number of clusters should be 2, since if we were to cut the dendogram further down the distances aren't that significant. However, if we were to correlate the dendogram to the scatter plot, we would see that the red cluster is formed by two other ones that are really close.

Determining the optimal number of clusters on heart dataset

KMeans

In [ ]:
distorsions = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k).fit(data_heart_X_normalized)
    predictions = kmeans.labels_

    cm = confusion_matrix(data_heart_y, predictions)
    cmd = ConfusionMatrixDisplay(cm)
    cmd.plot()
    print(f"Adjusted rand index {k} clusters: ", adjusted_rand_score(data_y, predictions))

    distorsions.append(kmeans.inertia_ / data_iris_X.shape[1])

fig = plt.figure(figsize=(15, 5))
plt.plot(range(2, 11), distorsions)
plt.xticks(range(2, 11))
plt.grid(True)
plt.title('Elbow curve')
plt.show()
Adjusted rand index 2 clusters:  0.3867962148223543
Adjusted rand index 3 clusters:  0.2608992987661521
Adjusted rand index 4 clusters:  0.1891379292064363
Adjusted rand index 5 clusters:  0.19231034119845103
Adjusted rand index 6 clusters:  0.16346452145067147
Adjusted rand index 7 clusters:  0.12153815835272794
Adjusted rand index 8 clusters:  0.11181766770089618
Adjusted rand index 9 clusters:  0.10162732279287125
Adjusted rand index 10 clusters:  0.09627148211919576
In [93]:
plot_silhouette_kmeans(data_heart_X_normalized)
For n_clusters = 2 The average silhouette_score is : 0.16823269494218376
For n_clusters = 3 The average silhouette_score is : 0.1179410931646522
For n_clusters = 4 The average silhouette_score is : 0.12067674216721785
For n_clusters = 5 The average silhouette_score is : 0.11826140058143636
For n_clusters = 6 The average silhouette_score is : 0.11695152381863323
For n_clusters = 7 The average silhouette_score is : 0.10773522041083566
For n_clusters = 8 The average silhouette_score is : 0.11646980287021377
For n_clusters = 9 The average silhouette_score is : 0.0935385659036198
For n_clusters = 10 The average silhouette_score is : 0.12004203906694347

Hierarchical

Single linkage

In [94]:
plot_silhouette('single', data_heart_X_normalized)
For n_clusters = 2 The average silhouette_score is : 0.280124891137619
For n_clusters = 3 The average silhouette_score is : 0.2615913059624888
For n_clusters = 4 The average silhouette_score is : 0.2346155025734294
For n_clusters = 5 The average silhouette_score is : 0.16971376443649502
For n_clusters = 6 The average silhouette_score is : 0.1350794425902002
For n_clusters = 7 The average silhouette_score is : 0.12838974811829199
For n_clusters = 8 The average silhouette_score is : 0.12798159897769376
For n_clusters = 9 The average silhouette_score is : 0.097725288993442
For n_clusters = 10 The average silhouette_score is : 0.08969422406015466
In [95]:
plot_dendrogram('single', data_heart_X_normalized)

Complete linkage

In [96]:
plot_silhouette('complete', data_heart_X_normalized)
For n_clusters = 2 The average silhouette_score is : 0.3532620013624347
For n_clusters = 3 The average silhouette_score is : 0.17357978973182858
For n_clusters = 4 The average silhouette_score is : 0.11248832648026134
For n_clusters = 5 The average silhouette_score is : 0.10801947112149632
For n_clusters = 6 The average silhouette_score is : 0.0863561668608298
For n_clusters = 7 The average silhouette_score is : 0.07737629686164536
For n_clusters = 8 The average silhouette_score is : 0.07808919979377997
For n_clusters = 9 The average silhouette_score is : 0.07955188479783823
For n_clusters = 10 The average silhouette_score is : 0.07910149679111246
In [97]:
plot_dendrogram('complete', data_heart_X_normalized)

Average linkage

In [98]:
plot_silhouette('average', data_heart_X_normalized)
For n_clusters = 2 The average silhouette_score is : 0.3532620013624347
For n_clusters = 3 The average silhouette_score is : 0.2769128750661414
For n_clusters = 4 The average silhouette_score is : 0.24073303792557374
For n_clusters = 5 The average silhouette_score is : 0.18489891196231598
For n_clusters = 6 The average silhouette_score is : 0.14684647467684017
For n_clusters = 7 The average silhouette_score is : 0.12997293454211462
For n_clusters = 8 The average silhouette_score is : 0.10618702599061583
For n_clusters = 9 The average silhouette_score is : 0.08870427712985438
For n_clusters = 10 The average silhouette_score is : 0.07469494381583255
In [99]:
plot_dendrogram('average', data_heart_X_normalized)

Ward Variance

In [100]:
plot_silhouette('ward', data_heart_X_normalized)
For n_clusters = 2 The average silhouette_score is : 0.1325316617259095
For n_clusters = 3 The average silhouette_score is : 0.09737426014314339
For n_clusters = 4 The average silhouette_score is : 0.10362979874322531
For n_clusters = 5 The average silhouette_score is : 0.10367930498325308
For n_clusters = 6 The average silhouette_score is : 0.112745906869604
For n_clusters = 7 The average silhouette_score is : 0.09651350891265457
For n_clusters = 8 The average silhouette_score is : 0.09966833037667293
For n_clusters = 9 The average silhouette_score is : 0.10675366267587831
For n_clusters = 10 The average silhouette_score is : 0.11108210371009258
In [101]:
plot_dendrogram('ward', data_heart_X_normalized)